import selenium  #This is equivalent to the R command library(selenium)
from selenium import webdriver #Importing the function we need
from bs4 import BeautifulSoup
import re
from selenium.webdriver.common.by import By
import pandas as pd
import time
import os
import json


# Run this file from inside the script directory.
# The last 7 characters of the current working directory are dropped and the
# process changes into the resulting path, so the relative 'Data/...' paths
# used further down resolve against it.
# NOTE(review): the 7 presumably matches the length of the script folder's
# name -- confirm before moving or renaming that folder.
TRAILING_CHARS_TO_DROP = 7

cwd = os.getcwd()
path = cwd[:-TRAILING_CHARS_TO_DROP]
os.chdir(path)


# Scrape the archives catalog search pages and collect (docket, NAID) pairs.
#
# For each two-digit term year the search URL is loaded in Chrome, the page is
# flattened to plain text, and the text is split on the literal 'File Unit'.
# Every chunk that contains both a 'Docket <year>-<digits>' name and a
# 'NAID: <digits>' identifier contributes one (name, naid) tuple.
#
# Result: `dockets` maps each year string to its list of (name, naid) tuples.
link_start = 'https://catalog.archives.gov/search-within/4325222?limit=10000&q=Docket%20'
naid_pattern = re.compile(r'NAID: (\d+)')  # loop-invariant, compiled once
dockets = {}

driver = webdriver.Chrome()  # Load our browser
try:
    for year in ['96', '97', '98', '99', '00', '01']:
        driver.get(link_start + year)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        page_text = soup.get_text()

        name_pattern = re.compile('Docket (' + year + '-[0-9]{1,5})')
        docket = []
        for chunk in page_text.split('File Unit'):
            name_match = name_pattern.search(chunk)
            naid_match = naid_pattern.search(chunk)
            # Chunks missing either field are skipped explicitly rather than
            # through a bare `except`, which would also hide unrelated errors
            # (including KeyboardInterrupt).
            if name_match is None or naid_match is None:
                continue
            docket.append((name_match.group(1), naid_match.group(1)))
        dockets[year] = docket
finally:
    # Always shut the browser down, even if a page load fails part-way.
    driver.quit()

# Write every (docket name, NAID) pair collected in `dockets` to a two-column
# CSV with a 'Docket,NAID' header row. The `with` block guarantees the file is
# closed even if one of the writes raises.
with open('Data/Processed_Data/Lower_Court_Dockets/archives_dockets.csv', 'w') as file:
    file.write('Docket,NAID\n')
    for term in dockets:
        for docket in dockets[term]:
            file.write(docket[0] + ',' + docket[1] + '\n')


# Download the raw text of every docket page listed in the CSV.
#
# Each row supplies a docket name and a NAID. The page URL is assembled from:
#   link_start + <int of the NAID's last two digits> + '/'
#              + <NAID characters 2..5> + '/'
#              + <NAID> + end_part + <docket name> + '.HTM'
# NOTE(review): this path layout is taken as-is from the original script --
# confirm it against the storage bucket if links start failing.
#
# Result: `docket_info` maps each docket name to the page's plain text.
data = pd.read_csv('Data/Processed_Data/Lower_Court_Dockets/archives_dockets.csv')

link_start = 'https://s3.amazonaws.com/NARAprodstorage/opastorage/live/'
end_part = '/content/electronic-records/rg-267/appellate/'
docket_info = {}

driver = webdriver.Chrome()  # Load our browser
try:
    for d, raw_naid in zip(data['Docket'], data['NAID']):
        naid = str(raw_naid)
        mid_link = str(int(naid[-2:])) + '/'  # int() drops any leading zero
        next_link = naid[2:6] + '/'
        link = link_start + mid_link + next_link + naid + end_part + d + '.HTM'

        time.sleep(1)  # pause one second before each request
        driver.get(link)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        docket_info[d] = soup.get_text()
finally:
    # The original never closed this second browser; always release it.
    driver.quit()

# Persist the docket-name -> page-text mapping as pretty-printed JSON
# (4-space indentation).
raw_text_path = 'Data/Processed_Data/Lower_Court_Dockets/Supreme_Court_Dockets_Raw_Text_1996_2001.json'
with open(raw_text_path, 'w') as out_file:
    json.dump(docket_info, out_file, indent=4)